library(tidyverse)
library(leaflet)

Reduce number of categories

The number of categories was reduced from 39 to 15 by combining related crime types under a single category. This reduction in detail makes it easier to gain quick insights when plotting the data. It is not expected to significantly affect the analysis.

# Lookup table used further down to map each original `Category` onto its
# reduced `New Category`.
Categories <- read_csv(file = "Categories.csv")

incidents_house_price dataset

Load the incidents_house_price dataset from the previous data cleaning exercise.

# Incident-level data written out by the earlier data cleaning step.
incidents_house_price <- read_csv(file = "incidents_house_price.csv")

Reduce categories in incidents_house_price dataset

# Attach the reduced category to every incident. The lookup table's `n`
# column is not needed afterwards, and `New Category` is renamed to a
# syntactic name so it can be used without backticks.
incidents_new_categories <- incidents_house_price %>%
  left_join(Categories, by = "Category") %>%
  rename(New_Category = `New Category`) %>%
  select(-n)

Extract only features useful for plotting on a map

# Number of incidents per (zipcode, category, coordinate) combination,
# largest counts first.
zipc <- incidents_new_categories %>%
  count(zipcode, New_Category, latitude, longitude) %>%
  arrange(desc(n))

# Keep only combinations with more than 10 incidents and derive the columns
# the map needs:
#   leaflet_labels - tooltip text, "<category> (<count>)"
#   leaflet_radius - index of the count bucket (0 for counts below 50)
# New_Category becomes a factor so its levels can drive the colour palette.
filt_zipc <- zipc %>%
  filter(n > 10) %>%
  mutate(
    leaflet_labels = paste0(New_Category, " (", n, ")"),
    leaflet_radius = findInterval(
      n,
      c(50, seq(100, 1000, by = 100), 2000, 5000, 10000)
    ),
    New_Category = as.factor(New_Category)
  )

Number of crimes by category

The number of crimes for each category is shown on a map of San Francisco using the leaflet library. This provides a spatial visualization of areas with a high concentration of crimes and of the crime types found there. The tooltip shows the category followed by the number of crimes in parentheses.

# One colour per crime category, taken from the "magma" palette.
col_pal <- colorFactor(
  palette = "magma",
  levels = levels(filt_zipc$New_Category)
)

# Interactive map: one circle per (zipcode, category, coordinate) row.
# The data frame is supplied through `data =`; passed positionally it would
# bind to `layerId`, the first formal argument not already matched by name.
# Formulas (~column) are evaluated against that data frame.
leaflet() %>%
  setView(lng = -122.4164, lat = 37.7766, zoom = 12) %>%
  addTiles() %>%
  addCircleMarkers(
    data = filt_zipc,
    lng = ~longitude,
    lat = ~latitude,
    radius = ~leaflet_radius * 1.5,
    weight = 5,
    fillOpacity = 0.8,
    color = ~col_pal(New_Category),
    label = ~leaflet_labels
  ) %>%
  addLegend(
    position = "topright",
    pal = col_pal,
    values = filt_zipc$New_Category
  )

## Number of crimes for each category

The bar plot below provides a relative comparison of the number of crimes for each category.

# Horizontal bar chart: one bar per reduced crime category, bar length is the
# number of incidents. Count labels are printed without scientific notation.
ggplot(incidents_new_categories, aes(x = New_Category)) +
  geom_bar() +
  coord_flip() +
  scale_y_continuous(
    breaks = seq(0, 500000, by = 25000),
    labels = function(n) format(n, scientific = FALSE)
  ) +
  labs(x = "Categories of crime", y = "Number of crimes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Incidents by day

The pie chart below shows how the number of crimes varies by day of the week.

# Number of incidents per day of the week (one row per day, count in `n`).
incidents_by_day <- incidents_new_categories %>%
  count(DayOfWeek)

# A single stacked bar wrapped into polar coordinates gives the pie chart.
# The black outline is removed from the legend keys only.
pie <- incidents_by_day %>%
  ggplot(aes(x = factor(1), y = n, fill = DayOfWeek)) +
  geom_bar(width = 1, stat = "identity", color = "black") +
  guides(fill = guide_legend(override.aes = list(colour = NA))) +
  coord_polar(theta = "y") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text.y = element_blank()
  )

# Midpoint of every slice along the stacked axis, in the row order of
# `incidents_by_day`. ggplot2 (>= 2.2.0) stacks groups in reverse order, so
# the last day starts at 0 and the first day ends at the total. The running
# sum is therefore accumulated from the last row backwards; a plain
# cumsum() would attach each label and count to the wrong slice.
y.breaks <- rev(cumsum(rev(incidents_by_day$n))) - incidents_by_day$n / 2

# Day names go on the outer axis and the counts inside the slices.
pie +
  theme(axis.text.x = element_text(color = "black")) +
  scale_y_continuous(breaks = y.breaks, labels = incidents_by_day$DayOfWeek) +
  geom_text(aes(y = y.breaks, label = n), size = 3)

Incidents by month and year

The heatmap below shows the number of crimes by month and year. It allows for easy visualization of hot-spots, i.e., month-year combinations with a high number of crimes.

# Number of incidents for every year/month combination.
incidents_by_month <- incidents_new_categories %>%
  count(year, month)

# Heatmap of monthly counts: months on x, years on y, colour = count.
# Only year/month cells with more than 7000 incidents are drawn.
incidents_by_month %>%
  # Factors are created before filtering so every month keeps its level.
  mutate(month = as.factor(month), year = as.factor(year)) %>%
  filter(n > 7000) %>%
  ggplot(aes(x = month, y = year)) +
  geom_tile(aes(fill = n)) +
  scale_fill_gradient(low = "white", high = "darkred") +
  # The y axis carries the year; the count is shown by the fill colour.
  labs(x = "Months", y = "Years", fill = "Number of crimes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0)) +
  # Labels are matched to the month levels by position, so unused levels must
  # be kept or the names would shift if the filter removed a whole month.
  # NOTE(review): assumes `month` has the 12 levels 1..12 - confirm.
  scale_x_discrete(labels = month.name, drop = FALSE)

Number of incidents resolved by each PD district

The bar plot below shows the number of resolved (Y) and unresolved (N) incidents for each PD district, which allows for easy comparison of case resolution across districts.

# Flag every incident: a Resolution of "NONE" is coded 0, anything else 1.
resolution <- incidents_new_categories %>%
  select(New_Category, PdDistrict, Resolution) %>%
  mutate(Resolved = ifelse(Resolution == "NONE", 0, 1))

# One panel per PD district with an "N" and a "Y" bar. The flag is recoded to
# a factor so the manual colours map to N (dark red) and Y (dark green).
resolution %>%
  mutate(Resolved = as.factor(ifelse(Resolved == 0, "N", "Y"))) %>%
  ggplot(aes(x = Resolved, color = Resolved, fill = Resolved)) +
  geom_bar() +
  facet_grid(. ~ PdDistrict) +
  scale_color_manual(values = c("DarkRed", "DarkGreen")) +
  scale_fill_manual(values = c("DarkRed", "DarkGreen")) +
  theme_bw() +
  theme(legend.position = "none")